************************************
*********** PROJECT INFO ***********
************************************

// Record Linkage for Character-Based Names
// 1900-1910 validation
// Includes total merge numbers, accuracy, uniqueness, balance test
// Author: Hannah Postel
// Date: 10/26/2022

**************************************
************* STEP ZERO **************
***** SPLIT CROSSWALK BY METHOD ******
**************************************

// Import and extract ABE-standard & BYU matches
// Save each separately to avoid m:1 merges later and ensure unique matches within each method

// Load the full 1900-1910 crosswalk. `clear` allows the script to run when a
// dataset is already in memory, matching every other `use` in this file.
use "$data/crosswalk_1900_1910.dta", clear

// Links flagged abe_exact_standard: keep only the two ids and the flag.
// `replace` lets the script be re-run without failing on the existing file.
preserve
keep if abe_exact_standard==1
keep histid_1900 histid_1910 abe_exact_standard
save "$data/crosswalk_1900_1910-abe.dta", replace
restore

// Links flagged census_tree_byu: same treatment.
preserve
keep if census_tree_byu==1
keep histid_1900 histid_1910 census_tree_byu
save "$data/crosswalk_1900_1910-byu.dta", replace
restore

// MLP matches need no extraction here: Step One merges "$data/mlp_190010_all.dta" directly

**************************************
************** STEP ONE **************
***** MERGE ALL MATCHES TOGETHER ***** 
**************************************

*** 1900 ***
// Start from the 1900 Chinese sample and give every variable a _1900 suffix.
// histid_1900 is shortened first so that it ends up as histid_1900 again.
use "$data/1900_chinese.dta", clear
rename histid_1900 histid
rename * *_1900

// ABE: attach the link and discard crosswalk rows with no 1900 record here
merge 1:1 histid_1900 using "$data/crosswalk_1900_1910-abe.dta"
drop if _merge==2
drop _merge
rename histid_1910 histid_1910_abe

// BYU: same pattern
merge 1:1 histid_1900 using "$data/crosswalk_1900_1910-byu.dta"
drop if _merge==2 // drop non-Chinese matches
drop _merge
rename histid_1910 histid_1910_byu

rename (abe_exact_standard census_tree_byu) (abe_std byu)

// MLP: a record counts as linked when `round` is filled in
merge 1:1 histid_1900 using "$data/mlp_190010_all.dta"
drop if _merge==2
generate mlp = 1 if round!=.
rename histid_1910 histid_1910_mlp
drop round-_merge

// Postel: a record counts as linked when `step` is non-empty
merge 1:1 histid_1900 using "$data/matches 1900-10.dta"
drop if _merge==2
replace step = "." if step==""
generate postel = 1 if step!="."
drop unique_file1-namelast _merge
rename histid_1910 histid_1910_postel

// Keep only those matched by any algorithm. N=57,350 unmatched.
drop if abe_std==. & byu==. & mlp==. & postel==.

// Turn the method flags into 0/1 indicators
foreach flag of varlist abe_std byu mlp postel {
	replace `flag' = 0 if `flag'==.
}

// Use "." as the placeholder for a 1910 id that a method did not supply
foreach id of varlist histid_1910_abe histid_1910_byu histid_1910_mlp histid_1910_postel {
	replace `id' = "." if `id'==""
}

save "$data/1900_allmatches.dta", replace


***************************************
************** STEP TWO **************
****** MERGE TO 1910 FULL-COUNT *******
***************************************

// THIS IS JUST FOR MATCH ACCURACY - WE'LL COME BACK FOR THE OTHER VARS LATER

// ABE histid's
// Attach 1910 names and ages to the 1900 records that ABE linked.
use "$data/1910_chinese.dta", clear
rename histid_1910 histid_1910_abe
// 1:m only because 1900 rows without an ABE link all carry the "." placeholder id
merge 1:m histid_1910_abe using "$data/1900_allmatches.dta"
keep if _merge==3

// Tag the 1910 fields (and the 1900 age) with the method name
rename (namefrst namelast age age_1900) ///
	(namefrst_1910_abe namelast_1910_abe age_1910_abe age_1900_abe)

keep histid_1900 histid_1910_abe abe_std ///
	namefrst_1900 namelast_1900 namefrst_1910_abe namelast_1910_abe ///
	age_1900_abe age_1910_abe
save "$data/1900_10_abematches.dta", replace

// BYU histid's
// Attach 1910 names and ages to the 1900 records that BYU linked.
use "$data/1910_chinese.dta", clear
rename histid_1910 histid_1910_byu
// 1:m only because 1900 rows without a BYU link all carry the "." placeholder id
merge 1:m histid_1910_byu using "$data/1900_allmatches.dta"
keep if _merge==3

// Tag the 1910 fields (and the 1900 age) with the method name
rename (namefrst namelast age age_1900) ///
	(namefrst_1910_byu namelast_1910_byu age_1910_byu age_1900_byu)

keep histid_1900 histid_1910_byu byu ///
	namefrst_1900 namelast_1900 namefrst_1910_byu namelast_1910_byu ///
	age_1900_byu age_1910_byu
save "$data/1900_10_byumatches.dta", replace

// MLP histid's
// Attach 1910 names and ages to the 1900 records that MLP linked.
use "$data/1910_chinese.dta", clear
rename histid_1910 histid_1910_mlp
// 1:m because 1900 rows without an MLP link all carry the "." placeholder id
merge 1:m histid_1910_mlp using "$data/1900_allmatches.dta"
keep if _merge==3

// Tag the 1910 fields (and the 1900 age) with the method name
rename (namefrst namelast age age_1900) ///
	(namefrst_1910_mlp namelast_1910_mlp age_1910_mlp age_1900_mlp)

keep histid_1900 histid_1910_mlp mlp ///
	namefrst_1900 namelast_1900 namefrst_1910_mlp namelast_1910_mlp ///
	age_1900_mlp age_1910_mlp
save "$data/1900_10_mlpmatches.dta", replace

// HP histid's
// Attach 1910 names and ages to the 1900 records that the Postel method linked.
use "$data/1910_chinese.dta", clear
ren histid_1910 histid_1910_postel
// 1:m because 1900 rows without a Postel link all carry the "." placeholder id
merge 1:m histid_1910_postel using "$data/1900_allmatches.dta"

ren namefrst namefrst_1910_postel
ren namelast namelast_1910_postel
ren age age_1910_postel
ren age_1900 age_1900_postel
keep if _merge==3
// The former `replace name3="." if name3==""` was removed: name3 is not in the
// keep list below, so the edit never reached the saved file.
keep namelast_1900 namefrst_1900 histid_1900 histid_1910_postel postel ///
	namefrst_1910_postel namelast_1910_postel age_1910_postel age_1900_postel step
// The dataset stays in memory after this save; the merges that follow build on it.
save "$data/1900_10_postelmatches.dta", replace

// The Postel links are still in memory from the save above. Fold in the other
// three per-method files, keyed on the 1900 id.
foreach method in abe byu mlp {
	merge 1:1 histid_1900 using "$data/1900_10_`method'matches.dta", nogenerate
}

// A method flag is missing when that method's file had no row for the record
foreach flag of varlist abe_std byu mlp postel {
	replace `flag' = 0 if `flag'==.
}

save "$data/1900_1910allmatches.dta", replace


**************************************
************* STEP THREE *************
******** SAMPLE FOR ACCURACY *********
**************************************

// ABE standard
preserve
keep if abe_std==1 // 4432
set seed 1234
sample 200, count

// Lay out the 200 sampled links for hand review, names side by side
keep namefrst_1900 namelast_1900 namefrst_1910_abe namelast_1910_abe age_1900_abe age_1910_abe
generate age_diff = age_1910_abe - age_1900_abe
order namefrst_1900 namelast_1900 namefrst_1910_abe namelast_1910_abe age_diff age_1900_abe age_1910_abe
sort namefrst_1900 namelast_1900

// Name-agreement conditions shared by the tallies below
local same_first namefrst_1900==namefrst_1910_abe
local same_full `same_first' & namelast_1900==namelast_1910_abe

count if `same_full' & age_diff==10 // 54 "exact"
count if `same_full' & age_diff==9 // 26 "likely"
count if `same_full' & age_diff==11 // 22 "likely"
count if `same_first' & inlist(age_diff, 8, 12) // 40 "plausible"
// NOTE(review): a missing age_diff is counted here too - confirm ages are never missing
count if !inrange(age_diff, 8, 12) // 5 "implausible" from age
// based on comparing remaining names, 34 likely non-matches based on names
// totals: 54 exact, 48 likely, 59 plausible, 39 implausible
restore

// BYU
preserve
keep if byu==1 // 10,622
set seed 1234
sample 200, count

// Lay out the 200 sampled links for hand review, names side by side
keep namefrst_1900 namelast_1900 namefrst_1910_byu namelast_1910_byu age_1900_byu age_1910_byu
generate age_diff = age_1910_byu - age_1900_byu
order namefrst_1900 namelast_1900 namefrst_1910_byu namelast_1910_byu age_diff age_1900_byu age_1910_byu
sort namefrst_1900 namelast_1900

// Name-agreement conditions shared by the tallies below
local same_first namefrst_1900==namefrst_1910_byu
local same_full `same_first' & namelast_1900==namelast_1910_byu

count if `same_full' & age_diff==10 // 25 "exact"
count if `same_full' & age_diff==9 // 11 "likely"
count if `same_full' & age_diff==11 // 21 "likely"
count if `same_first' & inlist(age_diff, 8, 12) // 24 "plausible"
// NOTE(review): a missing age_diff is counted here too - confirm ages are never missing
count if !inrange(age_diff, 8, 12) // 32 "implausible" from age
// based on comparing remaining names, 38 implausible
// totals: 25 exact, 32 likely, 73 plausible, 70 implausible
restore

// MLP
preserve
keep if mlp==1 // 2048
set seed 1234
sample 200, count

// Lay out the 200 sampled links for hand review, names side by side
keep namefrst_1900 namelast_1900 namefrst_1910_mlp namelast_1910_mlp age_1900_mlp age_1910_mlp
generate age_diff = age_1910_mlp - age_1900_mlp
order namefrst_1900 namelast_1900 namefrst_1910_mlp namelast_1910_mlp age_diff age_1900_mlp age_1910_mlp
sort namefrst_1900 namelast_1900

// Name-agreement conditions shared by the tallies below
local same_first namefrst_1900==namefrst_1910_mlp
local same_full `same_first' & namelast_1900==namelast_1910_mlp

count if `same_full' & age_diff==10 // 24 "exact"
count if `same_full' & age_diff==9 // 6 "likely"
count if `same_full' & age_diff==11 // 13 "likely"
count if `same_first' & inlist(age_diff, 8, 12) // 7 "plausible"
// NOTE(review): a missing age_diff is counted here too - confirm ages are never missing
count if !inrange(age_diff, 8, 12) // 42 "implausible" from age
// based on comparing remaining names, 53 implausible
// totals: 24 exact, 19 likely, 62 plausible, 95 implausible
restore

// Postel
preserve
keep if postel==1  // 7272
set seed 1234
sample 200, count

// Lay out the 200 sampled links for hand review, names side by side
keep namefrst_1900 namelast_1900 namefrst_1910_postel namelast_1910_postel age_1900_postel age_1910_postel
generate age_diff = age_1910_postel - age_1900_postel
order namefrst_1900 namelast_1900 namefrst_1910_postel namelast_1910_postel age_diff age_1900_postel age_1910_postel
sort namefrst_1900 namelast_1900

// Name-agreement conditions shared by the tallies below
local same_first namefrst_1900==namefrst_1910_postel
local same_full `same_first' & namelast_1900==namelast_1910_postel

count if `same_full' & age_diff==10 // 41 exact
count if `same_full' & age_diff==11 // 25 likely
count if `same_full' & age_diff==9 // 17 likely
count if `same_first' & inlist(age_diff, 8, 12)  // 32
// NOTE(review): a missing age_diff would be counted here too - confirm ages are never missing
count if !inrange(age_diff, 8, 12)  // 0
// totals: 41 exact, 42 likely, 117 plausible
restore

********************************
********** UNIQUENESS **********
********************************

use "$data/1900_allmatches.dta", clear
// Overlap between methods, among 1900 records matched by any algorithm
// n=19,134

**** INCLUDING POSTEL ****
// Every tally below conditions on postel==1 and varies the other three flags.

// found by all. n=341
// step 0: n=337
count if postel==1 & abe_std==1 & byu==1 & mlp==1

// found just by postel. n=2860
// by step: 680 / 291 / 1525 / 364
count if postel==1 & abe_std==0 & byu==0 & mlp==0

// found by postel & abe. n=1313
count if postel==1 & abe_std==1 & byu==0 & mlp==0

// found by postel & mlp. n=71
count if postel==1 & abe_std==0 & byu==0 & mlp==1

// found by postel & byu. n=919
count if postel==1 & abe_std==0 & byu==1 & mlp==0

// found by postel & byu & mlp. n=85
count if postel==1 & abe_std==0 & byu==1 & mlp==1

// found by postel & byu & abe. n=1584
count if postel==1 & abe_std==1 & byu==1 & mlp==0

// found by postel & mlp & abe. n=99
count if postel==1 & abe_std==1 & byu==0 & mlp==1



**********************************
****** 1900-10 BALANCE TEST ******
**********************************

**********************************
**** REPRESENTATIVENESS 1900 *****
**********************************

// POPULATION MEANS
use "$data/1900_chinese.dta", clear
summarize age sei yrimmig

// 0/1 indicators; a missing source value evaluates to 0, never to missing
generate single = (marst==6)
summarize single

generate english = (speakeng==2)
summarize english

generate lit2 = (lit==4)
summarize lit2

generate ca = (stateicp==71)
summarize ca

// Bring in the method flags; records absent from the matches file get missing flags
merge 1:1 histid_1900 using "$data/1900_allmatches.dta", nogenerate

// For each method (abe std, byu, mlp, Postel, in that order): build a 0/1
// matched indicator, then regress every characteristic on it with robust SEs.
local outcomes age sei yrimmig single english lit2 ca
foreach flag in abe_std byu mlp postel {
	// abe_std -> abe; the other flag names are already the stub
	local stub = subinstr("`flag'", "_std", "", 1)
	generate `stub'_match = (`flag'==1)

	foreach y of local outcomes {
		regress `y' `stub'_match, robust
	}
}


************************************
***** REPRESENTATIVENESS 1910 ******
************************************

*** POPULATION MEANS *** 
// Summary statistics for the full 1910 Chinese sample
use "$data/1910_chinese.dta", clear
sum age sei yrimmig

// Dummy vars (a missing source value is coded 0)
gen single=1 if marst==6
replace single=0 if single==.
sum single

gen english=1 if speakeng==2
replace english=0 if english==.
sum english

gen lit2=1 if lit==4
replace lit2=0 if lit2==.
sum lit2

gen ca=1 if stateicp==71
replace ca=0 if ca==.
sum ca

*** MERGE MATCHES TOGETHER ***
// Each merge is keyed on the 1910 id, discards rows found only in the link
// file, and reduces the method's marker to a 0/1 *_match indicator.

// ABE
merge 1:1 histid_1910 using "$data/crosswalk_1900_1910-abe.dta"
drop if _merge==2
gen abe_match=1 if abe_exact_standard==1
replace abe_match=0 if abe_match==.
drop _merge abe_exact_standard

// BYU
merge 1:1 histid_1910 using "$data/crosswalk_1900_1910-byu.dta"
drop if _merge==2
gen byu_match=1 if census_tree_byu==1
replace byu_match=0 if byu_match==.
drop _merge census_tree_byu

// MLP: read through $data like every other input, replacing a hard-coded
// absolute user path.
// NOTE(review): assumes $data is the directory the old absolute path pointed to - confirm
merge 1:1 histid_1910 using "$data/mlp_190010_all.dta"
drop if _merge==2
gen mlp_match=1 if round!=.
replace mlp_match=0 if mlp_match==.
drop round-_merge

// Postel: linked when `step` is non-empty
merge 1:1 histid_1910 using "$data/matches 1900-10.dta"
drop if _merge==2
gen postel_match=1 if step!=""
replace postel_match=0 if step==""
drop name1-_merge

save "$data/1910_matches.dta", replace


*** BALANCE ***

// One robust-SE regression per characteristic and method, run in the order
// ABE, BYU, MLP, Postel on the dataset left in memory by the merges above.
foreach method in abe byu mlp postel {
	foreach y in age sei yrimmig single english lit2 ca {
		regress `y' `method'_match, robust
	}
}
